home *** CD-ROM | disk | FTP | other *** search
/ Enter 2006 September / Enter 09 2006.iso / Internet / SpamExperts Home 1.1 / SpamExperts Home.exe / lib / spamexperts.modules / spambayes / Stats.pyc (.txt) < prev    next >
Encoding:
Python Compiled Bytecode  |  2006-07-14  |  12.6 KB  |  311 lines

  1. # Source Generated with Decompyle++
  2. # File: in.pyc (Python 2.4)
  3.  
  4. """Stats.py - SpamBayes statistics class.
  5.  
  6. Classes:
  7.     Stats - provides statistical information about previous activity.
  8.  
  9. Abstract:
  10.  
  11.     Provide statistics on the activity that spambayes has done - for
  12.     example the number of messages classified as each type, and the
  13.     number of messages trained as each type.  This information is
  14.     retrieved from the messageinfo database, so is as reliable as that
  15.     is <wink>.
  16.  
  17.     This class provides information for the web interface, the
  18.     Outlook plug-in, and sb_pop3dnd.
  19.  
  20. To Do:
  21.     o People would like pretty graphs, so maybe that could be done.
  22.       The trick is to find some way to make them, ideally without
  23.       requiring something like PIL to be installed.
  24.     o People have requested time-based statistics - mail per hour,
  25.       spam per hour, and so on.
  26.       Discussion on spambayes-dev indicated that this would be a lot
  27.       of work for not much gain; however, since we now have some
  28.       time data stored, it wouldn't be too bad, so maybe it can go in.
  29.     o Suggestions?
  30. """
  31. __author__ = 'Tony Meyer <ta-meyer@ihug.co.nz>'
  32. __credits__ = 'Kenny Pitt, Mark Hammond, all the spambayes folk.'
  33.  
  34. try:  # Compatibility shim: probe whether the names True and False are defined.
  35.     (True, False)
  36. except NameError:
  37.     (True, False) = (1, 0)  # Names are missing on this interpreter: bind them to the integers 1 and 0.
  38.  
  39. import time
  40. import types
  41. from spambayes.message import STATS_START_KEY
  42. from spambayes.message import database_type, open_storage, Message
  43.  
  44. try:  # Reuse `_` if it is already defined (presumably a gettext-style translation hook -- confirm).
  45.     _
  46. except NameError:
  47.     
  48.     _ = lambda arg: arg  # Fallback: `_` returns its argument unchanged, so _('text') is just 'text'.
  49.  
  50.  
  51. class Stats(object):
  52.     
  53.     def __init__(self, options, messageinfo_db, message_class = Message):
  54.         self.messageinfo_db = messageinfo_db
  55.         self.options = options
  56.         self.message_class = message_class
  57.         self.Reset()
  58.         self.from_date = self.messageinfo_db.get_statistics_start_date()
  59.         self.CalculatePersistentStats()
  60.  
  61.     
  62.     def Reset(self):
  63.         self.num_ham = self.num_spam = self.num_unsure = 0
  64.         self.num_trained_spam = self.num_trained_spam_fn = 0
  65.         self.num_trained_ham = self.num_trained_ham_fp = 0
  66.  
  67.     
  68.     def ResetTotal(self, permanently = False):
  69.         self.totals = { }
  70.         for stat in [
  71.             'num_ham',
  72.             'num_spam',
  73.             'num_unsure',
  74.             'num_trained_spam',
  75.             'num_trained_spam_fn',
  76.             'num_trained_ham',
  77.             'num_trained_ham_fp']:
  78.             self.totals[stat] = 0
  79.         
  80.         if permanently:
  81.             self.from_date = time.time()
  82.             self.messageinfo_db.set_statistics_start_date(self.from_date)
  83.         
  84.  
  85.     
  86.     def RecordClassification(self, score):
  87.         '''Record that a message has been classified this session.'''
  88.         if score >= self.options[('Categorization', 'spam_cutoff')]:
  89.             self.num_spam += 1
  90.         elif score >= self.options[('Categorization', 'ham_cutoff')]:
  91.             self.num_unsure += 1
  92.         else:
  93.             self.num_ham += 1
  94.  
  95.     
  96.     def RecordTraining(self, as_ham, old_score = None, old_class = None):
  97.         '''Record that a message has been trained this session.
  98.  
  99.         If old_score and old_class are None, then the message had not
  100.         previously been trained (e.g. using the "Train" box on the web
  101.         interface), and so cannot be considered a fp or fn).
  102.  
  103.         If both old_score and old_class are specified, old_score is used.
  104.         '''
  105.         pass
  106.  
  107.     
  108.     def CalculatePersistentStats(self):
  109.         '''Calculate the statistics totals (i.e. not this session).
  110.  
  111.         This is done by running through the messageinfo database and
  112.         adding up the various information.  This could get quite time
  113.         consuming if the messageinfo database gets very large, so
  114.         some consideration should perhaps be made about what to do
  115.         then.
  116.         '''
  117.         self.ResetTotal()
  118.         totals = self.totals
  119.         for msg_id in self.messageinfo_db.keys():
  120.             if msg_id == STATS_START_KEY:
  121.                 continue
  122.             
  123.             m = self.message_class(msg_id)
  124.             self.messageinfo_db.load_msg(m)
  125.             if m.date_modified is None:
  126.                 continue
  127.             
  128.             if self.from_date and m.date_modified < self.from_date:
  129.                 continue
  130.             
  131.             classification = m.GetClassification()
  132.             trained = m.GetTrained()
  133.             if classification == self.options[('Headers', 'header_spam_string')]:
  134.                 totals['num_spam'] += 1
  135.                 if trained == False:
  136.                     totals['num_trained_ham_fp'] += 1
  137.                 
  138.             trained == False
  139.             if classification == self.options[('Headers', 'header_ham_string')]:
  140.                 totals['num_ham'] += 1
  141.                 if trained == True:
  142.                     totals['num_trained_spam_fn'] += 1
  143.                 
  144.             trained == True
  145.             if classification == self.options[('Headers', 'header_unsure_string')]:
  146.                 totals['num_unsure'] += 1
  147.                 if trained == False:
  148.                     totals['num_trained_ham'] += 1
  149.                 elif trained == True:
  150.                     totals['num_trained_spam'] += 1
  151.                 
  152.             trained == False
  153.         
  154.  
  155.     
  156.     def _CombineSessionAndTotal(self):
  157.         totals = self.totals
  158.         data = { }
  159.         data['num_ham'] = self.num_ham + totals['num_ham']
  160.         data['num_spam'] = self.num_spam + totals['num_spam']
  161.         data['num_unsure'] = self.num_unsure + totals['num_unsure']
  162.         data['num_seen'] = data['num_ham'] + data['num_spam'] + data['num_unsure']
  163.         data['num_trained_ham'] = self.num_trained_ham + totals['num_trained_ham']
  164.         data['num_trained_ham_fp'] = self.num_trained_ham_fp + totals['num_trained_ham_fp']
  165.         data['num_trained_spam'] = self.num_trained_spam + totals['num_trained_spam']
  166.         data['num_trained_spam_fn'] = self.num_trained_spam_fn + totals['num_trained_spam_fn']
  167.         return data
  168.  
  169.     
  170.     def _CalculateAdditional(self, data):
  171.         data['perc_ham'] = 100.0 * data['num_ham'] / data['num_seen']
  172.         data['perc_spam'] = 100.0 * data['num_spam'] / data['num_seen']
  173.         data['perc_unsure'] = 100.0 * data['num_unsure'] / data['num_seen']
  174.         data['num_ham_correct'] = data['num_ham'] - data['num_trained_spam_fn']
  175.         data['num_spam_correct'] = data['num_spam'] - data['num_trained_ham_fp']
  176.         data['num_correct'] = data['num_ham_correct'] + data['num_spam_correct']
  177.         data['num_incorrect'] = data['num_trained_spam_fn'] + data['num_trained_ham_fp']
  178.         data['perc_correct'] = 100.0 * data['num_correct'] / data['num_seen']
  179.         data['perc_incorrect'] = 100.0 * data['num_incorrect'] / data['num_seen']
  180.         data['perc_fp'] = 100.0 * data['num_trained_ham_fp'] / data['num_seen']
  181.         data['perc_fn'] = 100.0 * data['num_trained_spam_fn'] / data['num_seen']
  182.         data['num_unsure_trained_ham'] = data['num_trained_ham'] - data['num_trained_ham_fp']
  183.         data['num_unsure_trained_spam'] = data['num_trained_spam'] - data['num_trained_spam_fn']
  184.         data['num_unsure_not_trained'] = data['num_unsure'] - data['num_unsure_trained_ham'] - data['num_unsure_trained_spam']
  185.         if data['num_unsure']:
  186.             data['perc_unsure_trained_ham'] = 100.0 * data['num_unsure_trained_ham'] / data['num_unsure']
  187.             data['perc_unsure_trained_spam'] = 100.0 * data['num_unsure_trained_spam'] / data['num_unsure']
  188.             data['perc_unsure_not_trained'] = 100.0 * data['num_unsure_not_trained'] / data['num_unsure']
  189.         
  190.         data['total_ham'] = data['num_ham_correct'] + data['num_trained_ham']
  191.         data['total_spam'] = data['num_spam_correct'] + data['num_trained_spam']
  192.         if data['total_ham']:
  193.             data['perc_ham_incorrect'] = 100.0 * data['num_trained_ham_fp'] / data['total_ham']
  194.             data['perc_ham_unsure'] = 100.0 * data['num_unsure_trained_ham'] / data['total_ham']
  195.             data['perc_ham_incorrect_or_unsure'] = 100.0 * (data['num_trained_ham_fp'] + data['num_unsure_trained_ham']) / data['total_ham']
  196.         
  197.         if data['total_spam']:
  198.             data['perc_spam_correct'] = 100.0 * data['num_spam_correct'] / data['total_spam']
  199.             data['perc_spam_unsure'] = 100.0 * data['num_unsure_trained_spam'] / data['total_spam']
  200.             data['perc_spam_correct_or_unsure'] = 100.0 * (data['num_spam_correct'] + data['num_unsure_trained_spam']) / data['total_spam']
  201.         
  202.         fp_cost = self.options[('TestDriver', 'best_cutoff_fp_weight')]
  203.         fn_cost = self.options[('TestDriver', 'best_cutoff_fn_weight')]
  204.         unsure_cost = self.options[('TestDriver', 'best_cutoff_unsure_weight')]
  205.         data['total_cost'] = data['num_trained_ham_fp'] * fp_cost + data['num_trained_spam_fn'] * fn_cost + data['num_unsure'] * unsure_cost
  206.         no_filter_cost = data['num_spam'] * fn_cost
  207.         data['cost_savings'] = no_filter_cost - data['total_cost']
  208.         return data
  209.  
  210.     
  211.     def _AddPercentStrings(self, data, dp):
  212.         data['perc_ham_s'] = '%%(perc_ham).%df%%(perc)s' % (dp,)
  213.         data['perc_spam_s'] = '%%(perc_spam).%df%%(perc)s' % (dp,)
  214.         data['perc_unsure_s'] = '%%(perc_unsure).%df%%(perc)s' % (dp,)
  215.         data['perc_correct_s'] = '%%(perc_correct).%df%%(perc)s' % (dp,)
  216.         data['perc_incorrect_s'] = '%%(perc_incorrect).%df%%(perc)s' % (dp,)
  217.         data['perc_fp_s'] = '%%(perc_fp).%df%%(perc)s' % (dp,)
  218.         data['perc_fn_s'] = '%%(perc_fn).%df%%(perc)s' % (dp,)
  219.         data['perc_spam_correct_s'] = '%%(perc_spam_correct).%df%%(perc)s' % (dp,)
  220.         data['perc_spam_unsure_s'] = '%%(perc_spam_unsure).%df%%(perc)s' % (dp,)
  221.         data['perc_spam_correct_or_unsure_s'] = '%%(perc_spam_correct_or_unsure).%df%%(perc)s' % (dp,)
  222.         data['perc_ham_incorrect_s'] = '%%(perc_ham_incorrect).%df%%(perc)s' % (dp,)
  223.         data['perc_ham_unsure_s'] = '%%(perc_ham_unsure).%df%%(perc)s' % (dp,)
  224.         data['perc_ham_incorrect_or_unsure_s'] = '%%(perc_ham_incorrect_or_unsure).%df%%(perc)s' % (dp,)
  225.         data['perc_unsure_trained_ham_s'] = '%%(perc_unsure_trained_ham).%df%%(perc)s' % (dp,)
  226.         data['perc_unsure_trained_spam_s'] = '%%(perc_unsure_trained_spam).%df%%(perc)s' % (dp,)
  227.         data['perc_unsure_not_trained_s'] = '%%(perc_unsure_not_trained).%df%%(perc)s' % (dp,)
  228.         data['perc'] = '%'
  229.         return data
  230.  
  231.     
  232.     def GetStats(self, use_html = False, session_only = False, decimal_points = 1):
  233.         '''Return a description of the statistics.
  234.  
  235.         If session_only is True, then only a description of the statistics
  236.         since we were last reset.  Otherwise, lifetime statistics (i.e.
  237.         those including the ones loaded).
  238.  
  239.         Users probably care most about persistent statistics, so present
  240.         those by default.  If session-only stats are desired, then a
  241.         special call to here can be made.
  242.  
  243.         The percentages will be accurate to the given number of decimal
  244.         points.
  245.  
  246.         If use_html is True, then the returned data is marked up with
  247.         appropriate HTML, otherwise it is plain text.
  248.         '''
  249.         chunks = []
  250.         push = chunks.append
  251.         if session_only:
  252.             data = { }
  253.             data['num_seen'] = self.num_ham + self.num_spam + self.num_unsure
  254.             data['num_ham'] = self.num_ham
  255.             data['num_spam'] = self.num_spam
  256.             data['num_unsure'] = self.num_unsure
  257.             data['num_trained_ham'] = self.num_trained_ham
  258.             data['num_trained_ham_fp'] = self.num_trained_ham_fp
  259.             data['num_trained_spam'] = self.num_trained_spam
  260.             data['num_trained_spam_fn'] = self.num_trained_spam_fn
  261.         else:
  262.             data = self._CombineSessionAndTotal()
  263.         push(_('Messages classified: %d') % (data['num_seen'],))
  264.         if data['num_seen'] == 0:
  265.             return chunks
  266.         
  267.         data = self._CalculateAdditional(data)
  268.         format_dict = self._AddPercentStrings(data, decimal_points)
  269.         if use_html:
  270.             format_dict['tab'] = '    '
  271.         else:
  272.             format_dict['tab'] = '\t'
  273.         push(_('%(tab)sGood:%(tab)s%(num_ham)d (%(perc_ham_s)s)') % format_dict % format_dict)
  274.         push(_('%(tab)sSpam:%(tab)s%(num_spam)d (%(perc_spam_s)s)') % format_dict % format_dict)
  275.         push(_('%(tab)sUnsure:%(tab)s%(num_unsure)d (%(perc_unsure_s)s)') % format_dict % format_dict)
  276.         push('')
  277.         push(_('Classified correctly:%(tab)s%(num_correct)d (%(perc_correct_s)s of total)') % format_dict % format_dict)
  278.         push(_('Classified incorrectly:%(tab)s%(num_incorrect)d (%(perc_incorrect_s)s of total)') % format_dict % format_dict)
  279.         if format_dict['num_incorrect']:
  280.             push(_('%(tab)sFalse positives:%(tab)s%(num_trained_ham_fp)d (%(perc_fp_s)s of total)') % format_dict % format_dict)
  281.             push(_('%(tab)sFalse negatives:%(tab)s%(num_trained_spam_fn)d (%(perc_fn_s)s of total)') % format_dict % format_dict)
  282.         
  283.         push('')
  284.         push(_('Manually classified as good:%(tab)s%(num_trained_ham)d') % format_dict)
  285.         push(_('Manually classified as spam:%(tab)s%(num_trained_spam)d') % format_dict)
  286.         push('')
  287.         if format_dict['num_unsure']:
  288.             push(_('Unsures trained as good:%(tab)s%(num_unsure_trained_ham)d (%(perc_unsure_trained_ham_s)s of unsures)') % format_dict % format_dict)
  289.             push(_('Unsures trained as spam:%(tab)s%(num_unsure_trained_spam)d (%(perc_unsure_trained_spam_s)s of unsures)') % format_dict % format_dict)
  290.             push(_('Unsures not trained:%(tab)s%(tab)s%(num_unsure_not_trained)d (%(perc_unsure_not_trained_s)s of unsures)') % format_dict % format_dict)
  291.             push('')
  292.         
  293.         if format_dict['total_spam']:
  294.             push(_('Spam correctly identified:%(tab)s%(perc_spam_correct_s)s (+ %(perc_spam_unsure_s)s unsure)') % format_dict % format_dict)
  295.         
  296.         if format_dict['total_ham']:
  297.             push(_('Good incorrectly identified:%(tab)s%(perc_ham_incorrect_s)s (+ %(perc_ham_unsure_s)s unsure)') % format_dict % format_dict)
  298.         
  299.         if format_dict['total_spam'] or format_dict['total_ham']:
  300.             push('')
  301.         
  302.         push(_('Total cost of spam:%(tab)s$%(total_cost).2f') % format_dict)
  303.         push(_('SpamBayes savings:%(tab)s$%(cost_savings).2f') % format_dict)
  304.         return chunks
  305.  
  306.  
  307. if __name__ == '__main__':  # Script entry point: print the statistics report, one chunk per line.
  308.     s = Stats()  # NOTE(review): Stats.__init__ requires options and messageinfo_db, so this call raises TypeError as written -- confirm the intended arguments.
  309.     print '\n'.join(s.GetStats())  # Python 2 print statement; GetStats() returns a list of strings.
  310.  
  311.